Chip 2006 June

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2006 June / CHIP 2006-06.2.iso / program / freeware / Democracy-0.8.2.exe / xulrunner / python / feed.py < prev next >

Wrap

Python Source | 2006-04-10 | 60.5 KB | 1,782 lines

from downloader import grabURL from HTMLParser import HTMLParser,HTMLParseError import xml from urlparse import urlparse, urljoin from urllib import urlopen from database import defaultDatabase from item import * from scheduler import ScheduleEvent from copy import copy from xhtmltools import unescape,xhtmlify,fixXMLHeader, fixHTMLHeader, toUTF8Bytes, urlencode from cStringIO import StringIO from threading import Thread, Semaphore import traceback #FIXME get rid of this from datetime import datetime, timedelta from inspect import isfunction from new import instancemethod import os import config import re import app whitespacePattern = re.compile(r"^[ \t\r\n]*$") def defaultFeedIconURL(): import resource return resource.url("images/feedicon.png") # Notes on character set encoding of feeds: # # The parsing libraries built into Python mostly use byte strings # instead of unicode strings. However, sometimes they get "smart" and # try to convert the byte stream to a unicode stream automatically. # # What does what when isn't clearly documented # # We use the function toUTF8Bytes() to fix those smart conversions # # If you run into Unicode crashes, adding that function in the # appropriate place should fix it. # Universal Feed Parser http://feedparser.org/ # Licensed under Python license import feedparser # Pass in a connection to the frontend def setDelegate(newDelegate): global delegate delegate = newDelegate # Pass in a feed sorting function def setSortFunc(newFunc): global sortFunc sortFunc = newFunc # # Adds a new feed using USM def addFeedFromFile(file): d = feedparser.parse(file) if d.feed.has_key('links'): for link in d.feed['links']: if link['rel'] == 'start': generateFeed(link['href']) return if d.feed.has_key('link'): addFeedFromWebPage(d.feed.link) # # Adds a new feed based on a link tag in a web page def addFeedFromWebPage(url): feedURL = getFeedURLFromWebPage(url) if not feedURL is None: generateFeed(feedURL) def getFeedURLFromWebPage(url): data = '' info = grabURL(url,"GET") if info is None: return None try: data = info['file-handle'].read() info['file-handle'].close() except: pass return HTMLFeedURLParser().getLink(info['updated-url'],data) # URL validitation and normalization def validateFeedURL(url): return re.match(r"^(http|https|feed)://[^/].*", url) is not None def normalizeFeedURL(url): # Valid URL are returned as-is if validateFeedURL(url): return url # Check valid schemes with invalid separator match = re.match(r"^(http|https|feed):/*(.*)$", url) if match is not None: return "%s://%s" % match.group(1,2) # Replace invalid schemes by http match = re.match(r"^(.*:/*)*(.*)$", url) if match is not None: return "http://%s" % match.group(2) # We weren't able to normalize print "DTV: unable to normalize URL %s" % url return url ## # Generates an appropriate feed for a URL # # @param url The URL of the feed def generateFeed(url,ufeed): thread = Thread(target=lambda: _generateFeed(url,ufeed), \ name="generateFeed -- %s" % url) thread.setDaemon(False) thread.start() def _generateFeed(url, ufeed, visible=True): if (url == "dtv:directoryfeed"): return DirectoryFeedImpl(ufeed) elif (url == "dtv:search"): return SearchFeedImpl(ufeed) elif (url == "dtv:searchDownloads"): return SearchDownloadsFeedImpl(ufeed) info = grabURL(url,"GET") if info is None: return None try: modified = info['last-modified'] except KeyError: modified = None try: etag = info['etag'] except KeyError: etag = None #Definitely an HTML feed if (info['content-type'].startswith('text/html') or info['content-type'].startswith('application/xhtml+xml')): #print "Scraping HTML" html = info['file-handle'].read() if info.has_key('charset'): html = fixHTMLHeader(html,info['charset']) charset = info['charset'] else: charset = None info['file-handle'].close() if delegate.isScrapeAllowed(url): return ScraperFeedImpl(info['updated-url'],initialHTML=html,etag=etag,modified=modified, charset=charset, visible=visible, ufeed=ufeed) else: return None #It's some sort of feed we don't know how to scrape elif (info['content-type'].startswith('application/rdf+xml') or info['content-type'].startswith('application/atom+xml')): #print "ATOM or RDF" html = info['file-handle'].read() info['file-handle'].close() if info.has_key('charset'): xmldata = fixXMLHeader(html,info['charset']) else: xmldata = html return RSSFeedImpl(info['updated-url'],initialHTML=xmldata,etag=etag,modified=modified, visible=visible, ufeed=ufeed) # If it's not HTML, we can't be sure what it is. # # If we get generic XML, it's probably RSS, but it still could be # XHTML. # # application/rss+xml links are definitely feeds. However, they # might be pre-enclosure RSS, so we still have to download them # and parse them before we can deal with them correctly. elif (info['content-type'].startswith('application/rss+xml') or info['content-type'].startswith('application/podcast+xml') or info['content-type'].startswith('text/xml') or info['content-type'].startswith('application/xml')): #print " It's doesn't look like HTML..." html = info["file-handle"].read() info["file-handle"].close() if info.has_key('charset'): xmldata = fixXMLHeader(html,info['charset']) html = fixHTMLHeader(html,info['charset']) charset = info['charset'] else: xmldata = html charset = None try: parser = xml.sax.make_parser() parser.setFeature(xml.sax.handler.feature_namespaces, 1) handler = RSSLinkGrabber(info['redirected-url'],charset) parser.setContentHandler(handler) parser.parse(StringIO(xmldata)) except xml.sax.SAXException: #it doesn't parse as RSS, so it must be HTML #print " Nevermind! it's HTML" if delegate.isScrapeAllowed(url): return ScraperFeedImpl(info['updated-url'],initialHTML=html,etag=etag,modified=modified, charset=charset, visible=visible, ufeed=ufeed) else: return None except UnicodeDecodeError: print "Unicode issue parsing... %s" % xmldata[0:300] traceback.print_exc() return None if handler.enclosureCount > 0 or handler.itemCount == 0: #print " It's RSS with enclosures" return RSSFeedImpl(info['updated-url'],initialHTML=xmldata,etag=etag,modified=modified, visible=visible, ufeed=ufeed) else: #print " It's pre-enclosure RSS" if delegate.isScrapeAllowed(url): return ScraperFeedImpl(info['updated-url'],initialHTML=xmldata,etag=etag,modified=modified, charset=charset, visible=visible, ufeed=ufeed) else: return None else: print "DTV doesn't know how to deal with "+info['content-type']+" feeds" return None ## # Handle configuration changes so we can update feed update frequencies def configDidChange(key, value): if key is config.CHECK_CHANNELS_EVERY_X_MN.key: for feed in app.globalViewList['feeds']: updateFreq = 0 try: updateFreq = feed.parsed["feed"]["ttl"] except: pass feed.setUpdateFrequency(updateFreq) config.addChangeCallback(configDidChange) ## # Actual implementation of a basic feed. class FeedImpl: def __init__(self, url, ufeed, title = None, visible = True): self.available = 0 self.unwatched = 0 self.url = url self.ufeed = ufeed self.items = [] if title == None: self.title = url else: self.title = title self.created = datetime.now() self.autoDownloadable = ufeed.initiallyAutoDownloadable if self.autoDownloadable: self.startfrom = datetime.min else: self.startfrom = datetime.max self.getEverything = False self.maxNew = -1 self.fallBehind = -1 self.expire = "system" self.visible = visible self.updating = False self.lastViewed = datetime.min self.thumbURL = defaultFeedIconURL() self.updateFreq = config.get(config.CHECK_CHANNELS_EVERY_X_MN)*60 # Sets the update frequency (in minutes). # - A frequency of -1 means that auto-update is disabled. def setUpdateFrequency(self, frequency): if frequency < 0: self.cancelUpdateEvents() self.updateFreq = -1 else: newFreq = max(config.get(config.CHECK_CHANNELS_EVERY_X_MN), frequency)*60 if newFreq != self.updateFreq: self.updateFreq = newFreq self.scheduleUpdateEvents(-1) def scheduleUpdateEvents(self, firstTriggerDelay): self.cancelUpdateEvents() if self.updateFreq > 0: self.scheduler = ScheduleEvent(self.updateFreq, self.update) if firstTriggerDelay >= 0: ScheduleEvent(firstTriggerDelay, self.update, False) def cancelUpdateEvents(self): try: self.scheduler.remove() self.scheduler = None except: pass # Subclasses should implement this def update(self): pass # Returns true iff this feed has been looked at def getViewed(self): ret = self.lastViewed != datetime.min return ret # Returns the ID of the actual feed, never that of the UniversalFeed wrapper def getFeedID(self): return self.getID() def getID(self): try: return self.ufeed.getID() except: print "%s has no ufeed" % self # Returns true if x is a newly available item, otherwise returns false def isAvailable(self, x): return x.creationTime > self.lastViewed and (x.getState() == 'stopped' or x.getState() == 'downloading') # Returns true if x is an unwatched item, otherwise returns false def isUnwatched(self, x): state = x.getState() return state == 'finished' or state == 'uploading' # Updates the state of unwatched and available items to meet # Returns true iff endChange() is called def updateUandA(self): # Note: I'm not locking this with the assumption that we don't # care if these totals reflect an actual snapshot of the # database. If items change in the middle of this, oh well. newU = 0 newA = 0 ret = False for item in self.items: if self.isAvailable(item): newA += 1 if self.isUnwatched(item): newU += 1 self.ufeed.beginRead() try: if newU != self.unwatched or newA != self.available: self.ufeed.beginChange() try: ret = True self.unwatched = newU self.available = newA finally: self.ufeed.endChange() finally: self.ufeed.endRead() return ret # Returns string with number of unwatched videos in feed def numUnwatched(self): return self.unwatched # Returns string with number of available videos in feed def numAvailable(self): return self.available # Returns true iff both unwatched and available numbers should be shown def showBothUAndA(self): return ((not self.isAutoDownloadable()) and self.unwatched > 0 and self.available > 0) # Returns true iff unwatched should be shown and available shouldn't def showOnlyU(self): return ((self.unwatched > 0 and self.available == 0) or (self.isAutoDownloadable() and self.unwatched > 0)) # Returns true iff available should be shown and unwatched shouldn't def showOnlyA(self): return ((not self.isAutoDownloadable()) and self.unwatched == 0 and self.available > 0) # Returns true iff neither unwatched nor available should be shown def showNeitherUNorA(self): return (self.unwatched == 0 and (self.isAutoDownloadable() or self.available == 0)) ## # Sets the last time the feed was viewed to now def markAsViewed(self): # FIXME uncomment to make "new" state last 6 hours. See #655, #733 self.lastViewed = datetime.now() #- timedelta(hours=6) self.updateUandA() ## # Returns true iff the feed is loading. Only makes sense in the # context of UniversalFeeds def isLoading(self): return False ## # Returns true iff this feed has a library def hasLibrary(self): return False ## # Downloads the next available item taking into account maxNew, # fallbehind, and getEverything def downloadNextAuto(self, dontUse = []): self.ufeed.beginRead() try: next = None #The number of items downloading from this feed dling = 0 #The number of items eligibile to download eligibile = 0 #The number of unwatched, downloaded items newitems = 0 #Find the next item we should get self.items.sort(sortFunc) for item in self.items: if (item.getState() == "autopending") and not item in dontUse: eligibile += 1 if next == None: next = item elif item.getPubDateParsed() > next.getPubDateParsed(): next = item if item.getState() == "downloading": dling += 1 if item.getState() == "finished" or item.getState() == "uploading" and not item.getSeen(): newitems += 1 finally: self.ufeed.endRead() if self.maxNew >= 0 and newitems >= self.maxNew: return False elif self.fallBehind>=0 and eligibile > self.fallBehind: dontUse.append(next) return self.downloadNextAuto(dontUse) elif next != None: self.ufeed.beginRead() try: self.startfrom = next.getPubDateParsed() finally: self.ufeed.endRead() next.download(autodl = True) return True else: return False def downloadNextManual(self): self.ufeed.beginRead() next = None self.items.sort(sortFunc) for item in self.items: if item.getState() == "manualpending": if next is None: next = item elif item.getPubDateParsed() < next.getPubDateParsed(): next = item if not next is None: next.download(autodl = False) self.ufeed.endRead() ## # Returns marks expired items as expired def expireItems(self): expireTime = datetime.max - datetime.min if self.expire == "feed": expireTime = self.expireTime elif self.expire == "system": expireTime = timedelta(days=config.get(config.EXPIRE_AFTER_X_DAYS)) if expireTime <= timedelta(0): return elif self.expire == "never": return for item in self.items: local = item.getFilename() is not "" expiring = datetime.now() - item.getDownloadedTime() > expireTime stateOk = item.getState() in ('finished', 'stopped', 'watched') keepIt = item.getKeep() if local and expiring and stateOk and not keepIt: item.expire() ## # Returns true iff feed should be visible def isVisible(self): self.ufeed.beginRead() try: ret = self.visible finally: self.ufeed.endRead() return ret ## # Switch the auto-downloadable state def setAutoDownloadable(self, automatic): self.ufeed.beginRead() try: self.autoDownloadable = (automatic == "1") if self.autoDownloadable: self.startfrom = datetime.now() else: self.startfrom = datetime.max finally: self.ufeed.endRead() ## # Sets the 'getEverything' attribute, True or False def setGetEverything(self, everything): self.ufeed.beginRead() try: self.getEverything = everything finally: self.ufeed.endRead() ## # Sets the expiration attributes. Valid types are 'system', 'feed' and 'never' # Expiration time is in hour(s). def setExpiration(self, type, time): self.ufeed.beginRead() try: self.expire = type self.expireTime = timedelta(hours=time) if self.expire == "never": for item in self.items: if item.getState() in ['finished','uploading','watched']: item.setKeep(True) finally: self.ufeed.endRead() ## # Sets the maxNew attributes. -1 means unlimited. def setMaxNew(self, maxNew): self.ufeed.beginRead() try: self.maxNew = maxNew finally: self.ufeed.endRead() ## # Return the 'system' expiration delay, in days (can be < 1.0) def getDefaultExpiration(self): return float(config.get(config.EXPIRE_AFTER_X_DAYS)) ## # Returns the 'system' expiration delay as a formatted string def getFormattedDefaultExpiration(self): expiration = self.getDefaultExpiration() formattedExpiration = '' if expiration < 0: formattedExpiration = 'never' elif expiration < 1.0: formattedExpiration = '%d hours' % int(expiration * 24.0) elif expiration == 1: formattedExpiration = '%d day' % int(expiration) elif expiration > 1 and expiration < 30: formattedExpiration = '%d days' % int(expiration) elif expiration >= 30: formattedExpiration = '%d months' % int(expiration / 30) return formattedExpiration ## # Returns "feed," "system," or "never" def getExpirationType(self): self.ufeed.beginRead() ret = self.expire self.ufeed.endRead() return ret ## # Returns"unlimited" or the maximum number of items this feed can fall behind def getMaxFallBehind(self): self.ufeed.beginRead() if self.fallBehind < 0: ret = "unlimited" else: ret = self.fallBehind self.ufeed.endRead() return ret ## # Returns "unlimited" or the maximum number of items this feed wants def getMaxNew(self): self.ufeed.beginRead() if self.maxNew < 0: ret = "unlimited" else: ret = self.maxNew self.ufeed.endRead() return ret ## # Returns the total absolute expiration time in hours. # WARNING: 'system' and 'never' expiration types return 0 def getExpirationTime(self): delta = None self.ufeed.beginRead() try: try: if self.expire == 'never' or (self.expire == 'system' and config.get(config.EXPIRE_AFTER_X_DAYS) <= 0): delta = timedelta() else: delta = self.expireTime except: delta = timedelta() finally: self.ufeed.endRead() return (delta.days * 24) + (delta.seconds / 3600) ## # Returns the number of days until a video expires def getExpireDays(self): ret = 0 self.ufeed.beginRead() try: try: ret = self.expireTime.days except: ret = timedelta(days=config.get(config.EXPIRE_AFTER_X_DAYS)).days finally: self.ufeed.endRead() return ret ## # Returns the number of hours until a video expires def getExpireHours(self): ret = 0 self.ufeed.beginRead() try: try: ret = int(self.expireTime.seconds/3600) except: ret = int(timedelta(days=config.get(config.EXPIRE_AFTER_X_DAYS)).seconds/3600) finally: self.ufeed.endRead() return ret ## # Returns true iff item is autodownloadable def isAutoDownloadable(self): self.ufeed.beginRead() ret = self.autoDownloadable self.ufeed.endRead() return ret def autoDownloadStatus(self): status = self.isAutoDownloadable() if status: return "ON" else: return "OFF" ## # Returns the title of the feed def getTitle(self): try: title = self.title if whitespacePattern.match(title): title = self.url return title except: return "" ## # Returns the URL of the feed def getURL(self): try: return self.url except: return "" ## # Returns the description of the feed def getDescription(self): return "<span />" ## # Returns a link to a webpage associated with the feed def getLink(self): return "" ## # Returns the URL of the library associated with the feed def getLibraryLink(self): return "" ## # Returns the URL of a thumbnail associated with the feed def getThumbnail(self): ret = self.thumbURL if ret is None or not (ret.startswith('http:') or ret.startswith('https:')): ret = defaultFeedIconURL() return ret ## # Returns URL of license assocaited with the feed def getLicense(self): return "" ## # Returns the number of new items with the feed def getNewItems(self): self.ufeed.beginRead() count = 0 for item in self.items: try: if item.getState() == 'finished' and not item.getSeen(): count += 1 except: pass self.ufeed.endRead() return count ## # This class is a magic class that can become any type of feed it wants # # It works by passing on attributes to the actual feed. class Feed(DDBObject): def __init__(self,url, initial = None, initiallyAutoDownloadable = True): self.origURL = url self.errorState = False self.initiallyAutoDownloadable = initiallyAutoDownloadable if initial is None: self.loading = True self.actualFeed = FeedImpl(url,self) DDBObject.__init__(self) thread = Thread(target=lambda: self.generateFeed(True), \ name="Feed.__init__ generate -- %s" % url) thread.setDaemon(False) thread.start() else: self.loading = False self.actualFeed = initial # Returns javascript to mark the feed as viewed # FIXME: Using setTimeout is a hack to get around JavaScript bugs # Without the timeout, the view is never completely updated def getMarkViewedJS(self): return ("function markViewed() {eventURL('action:markFeedViewed?url=%s');} setTimeout(markViewed, 5000);" % urlencode(self.getURL())) # Returns the ID of this feed. Deprecated. def getFeedID(self): return self.getID() def getID(self): return DDBObject.getID(self) def hasError(self): ret = False self.beginRead() try: ret = self.errorState finally: self.endRead() return ret def getError(self): return "Could not load feed" def update(self): self.beginRead() try: if self.loading: return elif self.errorState: self.loading = True self.errorState = False self.beginChange() self.endChange() thread = Thread(target=lambda: self.generateFeed(), \ name="Feed.update generate -- %s" % \ self.origURL) thread.setDaemon(False) thread.start() return finally: self.endRead() self.actualFeed.update() def generateFeed(self, removeOnError=False): temp = _generateFeed(self.url,self,visible=True) self.beginRead() try: self.loading = False if temp is None: self.errorState = True else: self.actualFeed = temp finally: self.endRead() if removeOnError and self.errorState: self.remove() else: self.beginChange() self.endChange() def getActualFeed(self): return self.__dict__['actualFeed'] def __getattr__(self,attr): return getattr(self.getActualFeed(),attr) def remove(self): self.beginChange() self.cancelUpdateEvents() try: DDBObject.remove(self) for item in self.items: if not item.getKeep(): item.expire() item.remove() finally: self.endChange() ## # Called by pickle during serialization def __getstate__(self): temp = copy(self.__dict__) #temp["itemlist"] = None return (3,temp) ## # Called by pickle during deserialization def __setstate__(self,state): (version, data) = state if version == 0: version += 1 if version == 1: data['thumbURL'] = defaultFeedIconURL() version += 1 if version == 2: data['lastViewed'] = datetime.min data['unwatched'] = 0 data['available'] = 0 version += 1 assert(version == 3) data['updating'] = False self.__dict__ = data # This object is useless without a FeedImpl associated with it if not data.has_key('actualFeed'): self.__class__ = DropItLikeItsHot # Dummy class to facilitate upgrade class YahooSearchFeedImpl: def __setstate__(self,state): self.__class__ = DropItLikeItsHot class RSSFeedImpl(FeedImpl): firstImageRE = re.compile('\<\s*img\s+[^>]*src\s*=\s*"(.*?)"[^>]*\>',re.I|re.M) def __init__(self,url,ufeed,title = None,initialHTML = None, etag = None, modified = None, visible=True): FeedImpl.__init__(self,url,ufeed,title,visible=visible) self.initialHTML = initialHTML self.etag = etag self.modified = modified self.scheduleUpdateEvents(0) ## # Returns the description of the feed def getDescription(self): self.ufeed.beginRead() try: ret = xhtmlify('<span>'+unescape(self.parsed.summary)+'</span>') except: ret = "<span />" self.ufeed.endRead() return ret ## # Returns a link to a webpage associated with the feed def getLink(self): self.ufeed.beginRead() try: ret = self.parsed.link except: ret = "" self.ufeed.endRead() return ret ## # Returns the URL of the library associated with the feed def getLibraryLink(self): self.ufeed.beginRead() try: ret = self.parsed.libraryLink except: ret = "" self.ufeed.endRead() return ret def hasVideoFeed(self, enclosures): hasOne = False for enclosure in enclosures: if ((enclosure.has_key('type') and (enclosure['type'].startswith('video/') or enclosure['type'].startswith('audio/') or enclosure['type'] == "application/x-bittorrent")) or (enclosure.has_key('url') and (enclosure['url'][-4:].lower() in ['.mov','.wmv','.mp4', '.m4v', '.mp3','.mpg','.avi'] or enclosure['url'][-8].lower() == '.torrent' or enclosure['url'][-5].lower() == '.mpeg'))): hasOne = True break return hasOne ## # Updates a feed def update(self): info = {} self.ufeed.beginRead() try: if self.updating: return else: self.updating = True finally: self.ufeed.endRead() if not self.initialHTML is None: html = self.initialHTML self.initialHTML = None else: info = grabURL(self.url,etag=self.etag,modified=self.modified) if info is None: self.ufeed.beginRead() try: self.updating = False finally: self.finishUpdate() return None html = info['file-handle'].read() info['file-handle'].close() if info.has_key('charset'): html = fixXMLHeader(html,info['charset']) if info['status'] == 304: self.ufeed.beginRead() try: self.updating = False finally: self.finishUpdate() return self.url = info['updated-url'] d = feedparser.parse(html) self.parsed = d self.ufeed.beginRead() try: try: self.title = self.parsed["feed"]["title"] except KeyError: try: self.title = self.parsed["channel"]["title"] except KeyError: pass if (self.parsed.feed.has_key('image') and self.parsed.feed.image.has_key('url')): self.thumbURL = self.parsed.feed.image.url for entry in self.parsed.entries: entry = self.addScrapedThumbnail(entry) new = True for item in self.items: try: if item.getRSSID() == entry["id"]: item.update(entry) new = False except KeyError: # If the item changes at all, it results in a # new entry if (item.getRSSEntry() == entry): item.update(entry) new = False if (new and entry.has_key('enclosures') and self.hasVideoFeed(entry.enclosures)): self.items.append(Item(self.ufeed,entry)) try: updateFreq = self.parsed["feed"]["ttl"] except KeyError: updateFreq = 0 self.setUpdateFrequency(updateFreq) self.updating = False finally: self.finishUpdate(info) def finishUpdate(self, info=None): if info is not None: if info.has_key('etag'): self.etag = info['etag'] if info.has_key('last-modified'): self.modified = info['last-modified'] self.ufeed.endRead() #FIXMENOW This is sloow... if not self.updateUandA(): self.ufeed.beginChange() self.ufeed.endChange() def addScrapedThumbnail(self,entry): if (entry.has_key('enclosures') and len(entry['enclosures'])>0 and entry.has_key('description') and not entry['enclosures'][0].has_key('thumbnail')): desc = RSSFeedImpl.firstImageRE.search(unescape(entry['description'])) if not desc is None: entry['enclosures'][0]['thumbnail'] = FeedParserDict({'url': desc.expand("\\1")}) return entry ## # Returns the URL of the license associated with the feed def getLicense(self): try: ret = self.parsed.license except: ret = "" return ret ## # Called by pickle during serialization def __getstate__(self): temp = copy(self.__dict__) temp["scheduler"] = None if temp.has_key('parsed') and 'bozo_exception' in temp['parsed']: # This can end up pointing into the XML parser, leading to # a pickling failure. del temp['parsed']['bozo_exception'] #temp["itemlist"] = None return (0,temp) ## # Called by pickle during deserialization def __setstate__(self,state): (version, data) = state assert(version == 0) data['updating'] = False self.__dict__ = data #self.itemlist = defaultDatabase.filter(lambda x:isinstance(x,Item) and x.feed is self) #FIXME: the update dies if all of the items aren't restored, so we # wait a little while before we start the update self.scheduleUpdateEvents(0.1) ## # A DTV Collection of items -- similar to a playlist class Collection(FeedImpl): def __init__(self,ufeed,title = None): FeedImpl.__init__(self,ufeed,url = "dtv:collection",title = title,visible = False) ## # Adds an item to the collection def addItem(self,item): if isinstance(item,Item): self.ufeed.beginRead() try: self.removeItem(item) self.items.append(item) finally: self.ufeed.endRead() return True else: return False ## # Moves an item to another spot in the collection def moveItem(self,item,pos): self.ufeed.beginRead() try: self.removeItem(item) if pos < len(self.items): self.items[pos:pos] = [item] else: self.items.append(item) finally: self.ufeed.endRead() ## # Removes an item from the collection def removeItem(self,item): self.ufeed.beginRead() try: for x in range(0,len(self.items)): if self.items[x] == item: self.items[x:x+1] = [] break finally: self.ufeed.endRead() return True ## # A feed based on un unformatted HTML or pre-enclosure RSS class ScraperFeedImpl(FeedImpl): #FIXME: change this to a higher number once we optimize a bit maxThreads = 1 def __init__(self,url,ufeed, title = None, visible = True, initialHTML = None,etag=None,modified = None,charset = None): FeedImpl.__init__(self,url,ufeed,title,visible) self.initialHTML = initialHTML self.initialCharset = charset self.linkHistory = {} self.linkHistory[url] = {} self.tempHistory = {} if not etag is None: self.linkHistory[url]['etag'] = etag if not modified is None: self.linkHistory[url]['modified'] = modified self.semaphore = Semaphore(ScraperFeedImpl.maxThreads) self.scheduleUpdateEvents(0) self.setUpdateFrequency(360) def getMimeType(self,link): info = grabURL(link,"HEAD") if info is None: return '' else: return info['content-type'] ## # This puts all of the caching information in tempHistory into the # linkHistory. This should be called at the end of an updated so that # the next time we update we don't unnecessarily follow old links def saveCacheHistory(self): self.ufeed.beginRead() try: for url in self.tempHistory.keys(): self.linkHistory[url] = self.tempHistory[url] self.tempHistory = {} finally: self.ufeed.endRead() ## # returns a tuple containing the text of the URL, the url (in case # of a permanent redirect), a redirected URL (in case of # temporary redirect)m and the download status def getHTML(self, url, useActualHistory = True): etag = None modified = None if self.linkHistory.has_key(url): if self.linkHistory[url].has_key('etag'): etag = self.linkHistory[url]['etag'] if self.linkHistory[url].has_key('modified'): modified = self.linkHistory[url]['modified'] info = grabURL(url, etag=etag, modified=modified) if info is None: return (None, url, url,404, None) else: if not self.tempHistory.has_key(info['updated-url']): self.tempHistory[info['updated-url']] = {} if info.has_key('etag'): self.tempHistory[info['updated-url']]['etag'] = info['etag'] if info.has_key('last-modified'): self.tempHistory[info['updated-url']]['modified'] = info['last-modified'] html = info['file-handle'].read() #print "Scraper got HTML of length "+str(len(html)) info['file-handle'].close() #print "Closed" if info.has_key('charset'): return (html, info['updated-url'],info['redirected-url'],info['status'],info['charset']) else: return (html, info['updated-url'],info['redirected-url'],info['status'],None) def addVideoItem(self,link,dict,linkNumber): link = link.strip() if dict.has_key('title'): title = dict['title'] else: title = link for item in self.items: if item.getURL() == link: return if dict.has_key('thumbnail') > 0: i=Item(self.ufeed, FeedParserDict({'title':title,'enclosures':[FeedParserDict({'url':link,'thumbnail':FeedParserDict({'url':dict['thumbnail']})})]}),linkNumber = linkNumber) else: i=Item(self.ufeed, FeedParserDict({'title':title,'enclosures':[FeedParserDict({'url':link})]}),linkNumber = linkNumber) self.items.append(i) if not self.updateUandA(): self.ufeed.beginChange() self.ufeed.endChange() def makeProcessLinkFunc(self,subLinks,depth,linkNumber): return lambda: self.processLinksThenFreeSem(subLinks,depth,linkNumber) def processLinksThenFreeSem(self,subLinks,depth,linkNumber): try: self.processLinks(subLinks, depth,linkNumber) finally: #print "Releasing semaphore" self.semaphore.release() #FIXME: compound names for titles at each depth?? def processLinks(self,links, depth = 0,linkNumber = 0): maxDepth = 2 urls = links[0] links = links[1] if depth<maxDepth: for link in urls: if depth == 0: linkNumber += 1 #print "Processing %s (%d)" % (link,linkNumber) # FIXME: Using file extensions totally breaks the # standard and won't work with Broadcast Machine or # Blog Torrent. However, it's also a hell of a lot # faster than checking the mime type for every single # file, so for now, we're being bad boys. Uncomment # the elif to make this use mime types for HTTP GET URLs if ((link[-4:].lower() in ['.mov','.wmv','.mp4','.m4v','.mp3','.mpg','.avi']) or (link[-5:].lower() in ['.mpeg'])): mimetype = 'video/unknown' elif link[-8:].lower() == '.torrent': mimetype = "application/x-bittorrent" #elif link.find('?') > 0 and link.lower().find('.htm') == -1: # mimetype = self.getMimeType(link) # #print " mimetype is "+mimetype else: mimetype = 'text/html' if mimetype != None: #This is text of some sort: HTML, XML, etc. if ((mimetype.startswith('text/html') or mimetype.startswith('application/xhtml+xml') or mimetype.startswith('text/xml') or mimetype.startswith('application/xml') or mimetype.startswith('application/rss+xml') or mimetype.startswith('application/podcast+xml') or mimetype.startswith('application/atom+xml') or mimetype.startswith('application/rdf+xml') ) and depth < maxDepth -1): (html, url, redirURL,status,charset) = self.getHTML(link) if status == 304: #It's cached pass elif not html is None: subLinks = self.scrapeLinks(html, redirURL,charset=charset) if depth == 0: self.semaphore.acquire() #print "Acquiring semaphore" thread = Thread(target = self.makeProcessLinkFunc(subLinks,depth+1,linkNumber), \ name = "scraper processLinks -- %s" % self.url) thread.setDaemon(False) thread.start() else: self.processLinks(subLinks,depth+1,linkNumber) else: pass #print link+" seems to be bogus..." #This is a video elif (mimetype.startswith('video/') or mimetype.startswith('audeo/') or mimetype == "application/x-bittorrent"): self.addVideoItem(link, links[link],linkNumber) #FIXME: go through and add error handling def update(self): self.ufeed.beginRead() try: if self.updating: return else: self.updating = True finally: self.ufeed.endRead() if not self.initialHTML is None: html = self.initialHTML self.initialHTML = None redirURL=self.url status = 200 charset = self.initialCharset self.initialCharset = None else: (html,url, redirURL, status,charset) = self.getHTML(self.url) if not status == 304: if not html is None: links = self.scrapeLinks(html, redirURL, setTitle=True,charset=charset) self.processLinks(links) #Download the HTML associated with each page self.ufeed.beginRead() try: self.saveCacheHistory() self.updating = False finally: self.ufeed.endRead() def scrapeLinks(self,html,baseurl,setTitle = False,charset = None): try: if not charset is None: xmldata = fixXMLHeader(html,charset) html = fixHTMLHeader(html,charset) else: xmldata = html parser = xml.sax.make_parser() parser.setFeature(xml.sax.handler.feature_namespaces, 1) if not charset is None: handler = RSSLinkGrabber(baseurl,charset) else: handler = RSSLinkGrabber(baseurl) parser.setContentHandler(handler) try: parser.parse(StringIO(xmldata)) except IOError, e: pass links = handler.links linkDict = {} for link in links: if link[0].startswith('http://') or link[0].startswith('https://'): if not linkDict.has_key(toUTF8Bytes(link[0],charset)): linkDict[toUTF8Bytes(link[0])] = {} if not link[1] is None: linkDict[toUTF8Bytes(link[0])]['title'] = toUTF8Bytes(link[1],charset).strip() if not link[2] is None: linkDict[toUTF8Bytes(link[0])]['thumbnail'] = toUTF8Bytes(link[2],charset) if setTitle and not handler.title is None: self.ufeed.beginChange() try: self.title = toUTF8Bytes(handler.title) finally: self.ufeed.endChange() return ([x[0] for x in links if x[0].startswith('http://') or x[0].startswith('https://')], linkDict) except (xml.sax.SAXException, IOError): (links, linkDict) = self.scrapeHTMLLinks(html,baseurl,setTitle=setTitle, charset=charset) return (links, linkDict) ## # Given a string containing an HTML file, return a dictionary of # links to titles and thumbnails def scrapeHTMLLinks(self,html, baseurl,setTitle=False, charset = None): #print "Scraping "+baseurl+" as HTML" lg = HTMLLinkGrabber() links = lg.getLinks(html, baseurl) if setTitle and not lg.title is None: self.ufeed.beginChange() try: self.title = toUTF8Bytes(lg.title) finally: self.ufeed.endChange() linkDict = {} for link in links: if link[0].startswith('http://') or link[0].startswith('https://'): if not linkDict.has_key(toUTF8Bytes(link[0],charset)): linkDict[toUTF8Bytes(link[0])] = {} if not link[1] is None: linkDict[toUTF8Bytes(link[0])]['title'] = toUTF8Bytes(link[1],charset).strip() if not link[2] is None: linkDict[toUTF8Bytes(link[0])]['thumbnail'] = toUTF8Bytes(link[2],charset) return ([x[0] for x in links if x[0].startswith('http://') or x[0].startswith('https://')],linkDict) ## # Called by pickle during serialization def __getstate__(self): temp = copy(self.__dict__) temp['semaphore'] = None temp["scheduler"] = None #temp["itemlist"] = None return (0,temp) ## # Called by pickle during deserialization def __setstate__(self,state): (version, data) = state assert(version == 0) data['updating'] = False data['tempHistory'] = {} self.__dict__ = data #self.itemlist = defaultDatabase.filter(lambda x:isinstance(x,Item) and x.feed is self) #FIXME: the update dies if all of the items aren't restored, so we # wait a little while before we start the update self.scheduleUpdateEvents(.1) self.semaphore = Semaphore(ScraperFeedImpl.maxThreads) ## # A feed of all of the Movies we find in the movie folder that don't # belong to a "real" feed # # FIXME: How do we trigger updates on this feed? class DirectoryFeedImpl(FeedImpl): def __init__(self,ufeed): FeedImpl.__init__(self,url = "dtv:directoryfeed",ufeed=ufeed,title = "Feedless Videos",visible = False) self.setUpdateFrequency(5) self.scheduleUpdateEvents(0) ## # Directory Items shouldn't automatically expire def expireItems(self): pass def setUpdateFrequency(self, frequency): newFreq = frequency*60 if newFreq != self.updateFreq: self.updateFreq = newFreq self.scheduleUpdateEvents(-1) ## # Returns a list of all of the files in a given directory def getFileList(self,dir): allthefiles = [] for root, dirs, files in os.walk(dir,topdown=True): if root == dir and 'Incomplete Downloads' in dirs: dirs.remove('Incomplete Downloads') toRemove = [] for curdir in dirs: if curdir[0] == '.': toRemove.append(curdir) for curdir in toRemove: dirs.remove(curdir) toRemove = [] for curfile in files: if curfile[0] == '.': toRemove.append(curfile) for curfile in toRemove: files.remove(curfile) allthefiles[:0] = map(lambda x:os.path.normcase(os.path.join(root,x)),files) return allthefiles def update(self): self.ufeed.beginRead() try: if self.updating: return else: self.updating = True finally: self.ufeed.endRead() knownFiles = [] #Files on the filesystem existingFiles = self.getFileList(config.get(config.MOVIES_DIRECTORY)) #Files known about by real feeds for item in app.globalViewList['items']: if not item.feed is self.ufeed: knownFiles[:0] = item.getFilenames() knownFiles = map(os.path.normcase,knownFiles) #Remove items that are in feeds, but we have in our list for x in range(0,len(self.items)): try: while (self.items[x].getFilename() in knownFiles) or (not self.items[x].getFilename() in existingFiles): self.items[x].remove() self.items[x:x+1] = [] except IndexError: pass #Files on the filesystem that we known about myFiles = map(lambda x:x.getFilename(),self.items) #Adds any files we don't know about for file in existingFiles: if not file in knownFiles and not file in myFiles: self.items.append(FileItem(self.ufeed,file)) self.updating = False ## # Called by pickle during serialization def __getstate__(self): temp = copy(self.__dict__) temp["scheduler"] = None return (0,temp) def __setstate__(self,state): (version, data) = state assert(version == 0) data['updating'] = False self.__dict__ = data #FIXME: the update dies if all of the items aren't restored, so we # wait a little while before we start the update self.scheduleUpdateEvents(.1) ## # Search and Search Results feeds class SearchFeedImpl (RSSFeedImpl): def __init__(self, ufeed): RSSFeedImpl.__init__(self, url='', ufeed=ufeed, title='dtv:search', visible=False) self.setUpdateFrequency(-1) self.setAutoDownloadable(False) self.searching = False self.lastEngine = 'yahoo' self.lastQuery = '' def getStatus(self): status = 'idle-empty' if self.searching: status = 'searching' elif len(self.items) > 0: status = 'idle-with-results' return status def reset(self, url='', searchState=False): self.ufeed.beginChange() try: for item in self.items: item.remove() self.items = [] self.url = url self.searching = searchState finally: self.ufeed.endChange() def preserveDownloads(self, downloadsFeed): self.ufeed.beginRead() try: allItems = [] + self.items for item in allItems: if item.getState() != 'stopped': downloadsFeed.addItem(item) finally: self.ufeed.endRead() def lookup(self, engine, query): url = self.getRequestURL(engine, query) self.reset(url, True) self.lastQuery = query thread = Thread(target=self.update, \ name = "%s search -- %s" % (engine, query)) thread.setDaemon(False) thread.start() def getRequestURL(self, engine, query, filterAdultContents=True, limit=50): if query == "LET'S TEST DTV'S CRASH REPORTER TODAY": someVariable = intentionallyUndefinedVariableToTestCrashReporter if engine == 'yahoo': url = "http://api.search.yahoo.com/VideoSearchService/rss/videoSearch.xml" url += "?appid=dtv_search" url += "&adult_ok=%d" % int(not filterAdultContents) url += "&results=%d" % limit url += "&format=any" url += "&query=%s" % urlencode(query) elif engine == 'blogdigger': url = "http://blogdigger.com/media/rss.jsp" url += "?q=%s" % urlencode(query) url += "&media=video" url += "&media=torrent" url += "&sortby=date" return url def update(self): if self.url is not None and self.url != '': RSSFeedImpl.update(self) def finishUpdate(self, info=None): self.searching = False RSSFeedImpl.finishUpdate(self, info) class SearchDownloadsFeedImpl (FeedImpl): def __init__(self, ufeed): FeedImpl.__init__(self, url='dtv:searchDownloads', ufeed=ufeed, title=None, visible=False) self.setUpdateFrequency(-1) def addItem(self, item): self.ufeed.beginRead() try: if not item in self.items: item.beginRead() try: item.feed.items.remove(item) item.feed = self.ufeed finally: item.endRead() self.items.append(item) finally: self.ufeed.endRead() ## # Parse HTML document and grab all of the links and their title # FIXME: Grab link title from ALT tags in images # FIXME: Grab document title from TITLE tags class HTMLLinkGrabber(HTMLParser): linkPattern = re.compile("^.*?<(a|embed)\s.*?(href|src)\s*=\s*\"(.*?)\".*?>(.*?)</a>(.*)$", re.S) imgPattern = re.compile(".*<img\s.*?src\s*=\s*\"(.*?)\".*?>", re.S) tagPattern = re.compile("<.*?>") def getLinks(self,data, baseurl): self.links = [] self.lastLink = None self.inLink = False self.inObject = False self.baseurl = baseurl self.inTitle = False self.title = None self.thumbnailUrl = None match = HTMLLinkGrabber.linkPattern.match(data) while match: link = urljoin(baseurl, match.group(3)) desc = match.group(4) imgMatch = HTMLLinkGrabber.imgPattern.match(desc) if imgMatch: thumb = urljoin(baseurl, imgMatch.group(1)) else: thumb = None desc = HTMLLinkGrabber.tagPattern.sub(' ',desc) self.links.append( (link, desc, thumb)) match = HTMLLinkGrabber.linkPattern.match(match.group(5)) return self.links class RSSLinkGrabber(xml.sax.handler.ContentHandler): def __init__(self,baseurl,charset=None): self.baseurl = baseurl self.charset = charset def startDocument(self): #print "Got start document" self.enclosureCount = 0 self.itemCount = 0 self.links = [] self.inLink = False self.inDescription = False self.inTitle = False self.inItem = False self.descHTML = '' self.theLink = '' self.title = None self.firstTag = True def startElementNS(self, name, qname, attrs): (uri, tag) = name if self.firstTag: self.firstTag = False if tag != 'rss': raise xml.sax.SAXNotRecognizedException, "Not an RSS file" if tag.lower() == 'enclosure' or tag.lower() == 'content': self.enclosureCount += 1 elif tag.lower() == 'link': self.inLink = True self.theLink = '' elif tag.lower() == 'description': self.inDescription = True self.descHTML = '' elif tag.lower() == 'item': self.itemCount += 1 self.inItem = True elif tag.lower() == 'title' and not self.inItem: self.inTitle = True def endElementNS(self, name, qname): (uri, tag) = name if tag.lower() == 'description': lg = HTMLLinkGrabber() try: html = xhtmlify(unescape(self.descHTML),addTopTags=True) if not self.charset is None: html = fixHTMLHeader(html,self.charset) self.links[:0] = lg.getLinks(html,self.baseurl) except HTMLParseError: # Don't bother with bad HTML print "DTV: bad HTML in %s" % self.baseurl self.inDescription = False elif tag.lower() == 'link': self.links.append((self.theLink,None,None)) self.inLink = False elif tag.lower() == 'item': self.inItem == False elif tag.lower() == 'title' and not self.inItem: self.inTitle = False def characters(self, data): if self.inDescription: self.descHTML += data elif self.inLink: self.theLink += data elif self.inTitle: if self.title is None: self.title = data else: self.title += data # Grabs the feed link from the given webpage class HTMLFeedURLParser(HTMLParser): def getLink(self,baseurl,data): self.baseurl = baseurl self.link = None try: self.feed(data) except HTMLParseError: print "DTV: error parsing "+str(baseurl) try: self.close() except HTMLParseError: print "DTV: error closing "+str(baseurl) return self.link def handle_starttag(self, tag, attrs): attrdict = {} for (key, value) in attrs: attrdict[key.lower()] = value if (tag.lower() == 'link' and attrdict.has_key('rel') and attrdict.has_key('type') and attrdict.has_key('href') and attrdict['rel'].lower() == 'alternate' and attrdict['type'].lower() in ['application/rss+xml', 'application/podcast+xml', 'application/rdf+xml', 'application/atom+xml', 'text/xml', 'application/xml']): self.link = urljoin(self.baseurl,attrdict['href']) class UniversalFeed: def __setstate__(self, state): (version, data) = state if version == 0: data['errorState'] = False version += 1 if version == 1: version += 1 assert(version == 2) self.__dict__ = data self.__class__ = Feed for key, val in Feed.__dict__.iteritems(): if isfunction(val): instancemethod(val, self,Feed) self.actualFeed.ufeed = self # UniversalFeeds should never contain Feeds. If they do, # something is wrong. if isinstance(self.actualFeed, Feed): self.__class__ = DropItLikeItsHot else: # FIXME: this assumes that the feed object is decoded before # it's items self.actualFeed.ufeed = self class ScraperFeed(ScraperFeedImpl): ## # Called by pickle during deserialization def __setstate__(self,state): (version, data) = state if version == 0: version += 1 if version == 1: data['thumbURL'] = defaultFeedIconURL() version += 1 if version == 2: data['lastViewed'] = datetime.min data['unwatched'] = 0 data['available'] = 0 version += 1 assert(version == 3) data['updating'] = False data['tempHistory'] = {} data['visible'] = True self.__dict__ = data self.__class__ = ScraperFeedImpl class DirectoryFeed(DirectoryFeedImpl): ## # Called by pickle during deserialization def __setstate__(self,state): (version, data) = state if version == 0: data['thumbURL'] = defaultFeedIconURL() version += 1 if version == 1: data['lastViewed'] = datetime.min data['unwatched'] = 0 data['available'] = 0 version += 1 assert(version == 2) data['updating'] = False self.__dict__ = data self.__class__ = DirectoryFeedImpl class RSSFeed(RSSFeedImpl): ## # Called by pickle during deserialization def __setstate__(self,state): (version, data) = state if version == 0: version += 1 if version == 1: data['thumbURL'] = defaultFeedIconURL() version += 1 if version == 2: data['lastViewed'] = datetime.min data['unwatched'] = 0 data['available'] = 0 version += 1 assert(version == 3) data['updating'] = False data['visible'] = True self.__dict__ = data self.__class__ = RSSFeedImpl